{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import re\n",
    "from lxml import html\n",
    "import time\n",
    "\n",
    "class MyCrawler:\n",
    "    def __init__(self, filename):\n",
    "        self.filename = filename\n",
    "        self.headers = {\n",
    "            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4128.3 Safari/537.36'\n",
    "        }\n",
    "\n",
    "    def download(self, url: str):\n",
    "        r = requests.get(url, headers=self.headers)\n",
    "        return r.text\n",
    "\n",
    "    def extract(self, content, pattern):\n",
    "        res = re.findall(pattern, content)\n",
    "        return res\n",
    "\n",
    "    def save(self, info):\n",
    "        with open(self.filename, 'a', encoding='utf-8') as f:\n",
    "            for item in info:\n",
    "                f.write(''.join(item) + '\\n') \n",
    "\n",
    "    def crawl(self, url: str, pattern: str, headers=None):\n",
    "        if headers:\n",
    "            self.headers.update(headers)\n",
    "        content = self.download(url)\n",
    "        info = self.extract(content, pattern)\n",
    "        self.save(info)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "['https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T',\n 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=20&type=T',\n 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=40&type=T',\n 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=60&type=T',\n 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=80&type=T',\n 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=100&type=T',\n 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=120&type=T',\n 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=140&type=T',\n 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=160&type=T',\n 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=180&type=T']"
     },
     "metadata": {},
     "execution_count": 2
    }
   ],
   "source": [
    "urls = [f'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start={start_id}&type=T' for start_id in range(0, 200, 20)]\n",
    "urls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=20&type=T' page is 52972 bytes\n'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=80&type=T' page is 52984 bytes\n'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0&type=T' page is 52891 bytes\n'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=40&type=T' page is 54057 bytes\n'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=60&type=T' page is 52794 bytes\n'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=100&type=T' page is 52683 bytes\n'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=120&type=T' page is 53638 bytes\n'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=180&type=T' page is 53970 bytes\n'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=140&type=T' page is 54064 bytes\n'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=160&type=T' page is 53494 bytes\nWall time: 1.02 s\n"
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "import concurrent.futures\n",
    "import requests\n",
    "\n",
    "douban_crawler = MyCrawler('douban.txt')\n",
    "\n",
    "def load_url(url):\n",
    "    global douban_crawler\n",
    "    return douban_crawler.download(url)\n",
    "\n",
    "with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:\n",
    "    \n",
    "    future_to_url = {executor.submit(load_url, url): url for url in urls}\n",
    "    for future in concurrent.futures.as_completed(future_to_url):\n",
    "        url = future_to_url[future]\n",
    "        try:\n",
    "            data = future.result()\n",
    "        except Exception as exc:\n",
    "            print('%r generated an exception: %s' % (url, exc))\n",
    "        else:\n",
    "            print('%r page is %d bytes' % (url, len(data)))\n"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}